In [ ]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "notebook"
In [ ]:
categories = [
    'frontpage',
    'news',
    'tech',
    'local',
    'opinion',
    'on-air',
    'misc',
    'weather',
    'msn-news',
    'health',
    'living',
    'business',
    'msn-sports',
    'sports',
    'summary',
    'bbs',
    'travel'
]
In [ ]:
def parse_dataset(path='./data/msnbc/data.seq'):
    with open(path) as f:
        user = 1
        data = []

        for line in f:
            split_line = line.split()
            user_cateogories = [categories[int(i) - 1] for i in split_line]

            data.append(np.array(user_cateogories, dtype=object))
            user += 1

    return np.array(data, dtype=object)

def get_refresh_data(category, user_visits):
    """
    Sliding window approach to find the longest/shortest
    continuous subsequence in O(n) time
    """
    longest, shortest, start, end = 0, len(user_visits) + 1, 0, len(user_visits)
    current, refreshes = 0, 0

    while start < end:
        if user_visits[start] == category:
            current += 1
        else:
            if current >= 2:
                current -= 1 #substract the initial visit
                refreshes += current
                longest = current if current > longest else longest
                shortest = current if current < shortest else shortest
                current = 0

        start += 1

    if current >= 2:
        current -= 1 #substract the initial visit
        refreshes += current
        longest = current if current > longest else longest
        shortest = current if current < shortest else shortest
    
    return refreshes, longest, shortest if shortest < len(user_visits) + 1 else 0

def get_revisit_data(category, user_visits):
    revisits, longest, shortest = 0, 0, len(user_visits) + 1
    previous = -1

    for current, visit in enumerate(user_visits):
        if visit == category:
            if previous == -1:
                previous = current
            else:
                distance = current - previous
                previous = current

                if distance > 1: #not a refresh
                    distance -= 1 #substract the initial visit
                    revisits += 1
                    longest = distance if distance > longest else longest
                    shortest = distance if distance < shortest else shortest

    return revisits, longest, shortest if shortest < len(user_visits) + 1 else 0

def get_dataset_stats(data):
    categories_stats = {}

    for category in tqdm(categories):
        categories_stats[category] = {
            'visits': 0,           #1 2 2 1, here 1 and 2 were visited both 2 times
            'unique_visits': 0,    #1 2 2 1, here 1 and 2 were visited both 1 time
            'refreshes': 0,        #1 1 1, here 1 counts as refreshed 2 times
            'unique_refreshes': 0, #refreshes counted once per user
            'revisits': 0,         #1 2 2 1, here 1 counts as revisited, 2 as refreshed
            'unique_revisits': 0,  #revisits counted once per user 
            'longest_refresh': 0,  #1 1 1 1, here 1 was refreshed 4 times
            'shortest_refresh': 0, 
            'longest_revisit': 0,  #1 2 2 1, revisit distance of 2 sites for 1
            'shortest_revisit': 0,
        }

        for user_visits in data:
            categories_stats[category]['visits'] += (user_visits == category).sum()
            categories_stats[category]['unique_visits'] += 1 if category in user_visits else 0

            refresh_data = get_refresh_data(category, user_visits)
            categories_stats[category]['refreshes'] += refresh_data[0]
            categories_stats[category]['unique_refreshes'] += (1 if refresh_data[0] > 0 else 0)
            
            if refresh_data[1] > categories_stats[category]['longest_refresh']:
                categories_stats[category]['longest_refresh'] = refresh_data[1]

            if refresh_data[2] != 0 and (refresh_data[2] < categories_stats[category]['shortest_refresh']\
                or categories_stats[category]['shortest_refresh'] == 0):
                categories_stats[category]['shortest_refresh'] = refresh_data[2]

            revisit_data = get_revisit_data(category, user_visits)
            categories_stats[category]['revisits'] += revisit_data[0]
            categories_stats[category]['unique_revisits'] += (1 if revisit_data[0] > 0 else 0)

            if revisit_data[1] > categories_stats[category]['longest_revisit']:
                categories_stats[category]['longest_revisit'] = revisit_data[1]

            if revisit_data[2] != 0 and (revisit_data[2] < categories_stats[category]['shortest_revisit']\
                or categories_stats[category]['shortest_revisit'] == 0):
                categories_stats[category]['shortest_revisit'] = revisit_data[2]

    return categories_stats
        
def plot_categories_statistics(categories, categories_stats):
    max_rows, max_cols = 9, 2
    row, col = 1, 1

    fig = make_subplots(
        rows=max_rows, cols=max_cols,
        subplot_titles=categories)

    for category in categories:
        X, Y = [], []

        for key in categories_stats[category]:
            X.append(key)
            Y.append(categories_stats[category][key])

        fig.add_trace(
            go.Histogram(histfunc='sum', x=X, y=Y, name=category),
            row=row, col=col
        )

        if col % max_cols == 0:
            col = 0
            row += 1

        col += 1

    fig.update_layout(
        title_text='Categories statistics',
        bargap=0.1,
        height=max_rows * 300,
        width=max_cols * 650
    )

    fig.show()

def plot_statistics_categories(categories, categories_stats):
    max_rows, max_cols = 10, 1
    row, col = 1, 1

    statistics = []
    for key in categories_stats[categories[0]]:
        statistics.append(key)

    fig = make_subplots(
        rows=max_rows, cols=max_cols,
        subplot_titles=statistics)

    for statistic in statistics:
        X, Y = [], []

        for category in categories:
            X.append(category)
            Y.append(categories_stats[category][statistic])

        fig.add_trace(
            go.Histogram(histfunc='sum', x=X, y=Y, name=statistic),
            row=row, col=col
        )

        if col % max_cols == 0:
            col = 0
            row += 1

        col += 1

    fig.update_layout(
        title_text='Statistics categories',
        bargap=0.1,
        height=max_rows * 300,
        width=max_cols * 1200
    )

    fig.show()
In [ ]:
data = parse_dataset('./data/msnbc/data.seq')
data[:5]
Out[ ]:
array([array(['frontpage', 'frontpage'], dtype=object),
       array(['news'], dtype=object),
       array(['tech', 'news', 'news', 'local', 'news', 'news', 'news', 'tech',
              'tech'], dtype=object)                                          ,
       array(['opinion'], dtype=object),
       array(['frontpage'], dtype=object)], dtype=object)
In [ ]:
dataset_stats = get_dataset_stats(data)
dataset_stats
100%|██████████| 17/17 [04:21<00:00, 15.38s/it]
Out[ ]:
{'frontpage': {'visits': 940469,
  'unique_visits': 313181,
  'refreshes': 526123,
  'unique_refreshes': 183510,
  'revisits': 208524,
  'unique_revisits': 106246,
  'longest_refresh': 14794,
  'shortest_refresh': 1,
  'longest_revisit': 1783,
  'shortest_revisit': 1},
 'news': {'visits': 452387,
  'unique_visits': 175286,
  'refreshes': 247557,
  'unique_refreshes': 87452,
  'revisits': 54761,
  'unique_revisits': 32303,
  'longest_refresh': 725,
  'shortest_refresh': 1,
  'longest_revisit': 583,
  'shortest_revisit': 1},
 'tech': {'visits': 207479,
  'unique_visits': 121948,
  'refreshes': 80048,
  'unique_refreshes': 36169,
  'revisits': 13543,
  'unique_revisits': 9683,
  'longest_refresh': 2057,
  'shortest_refresh': 1,
  'longest_revisit': 775,
  'shortest_revisit': 1},
 'local': {'visits': 386217,
  'unique_visits': 121719,
  'refreshes': 245416,
  'unique_refreshes': 58429,
  'revisits': 39607,
  'unique_revisits': 22843,
  'longest_refresh': 534,
  'shortest_refresh': 1,
  'longest_revisit': 1431,
  'shortest_revisit': 1},
 'opinion': {'visits': 151409,
  'unique_visits': 24987,
  'refreshes': 122160,
  'unique_refreshes': 15213,
  'revisits': 7435,
  'unique_revisits': 4453,
  'longest_refresh': 439,
  'shortest_refresh': 1,
  'longest_revisit': 1102,
  'shortest_revisit': 1},
 'on-air': {'visits': 414928,
  'unique_visits': 217101,
  'refreshes': 173252,
  'unique_refreshes': 70929,
  'revisits': 53845,
  'unique_revisits': 33531,
  'longest_refresh': 1257,
  'shortest_refresh': 1,
  'longest_revisit': 551,
  'shortest_revisit': 1},
 'misc': {'visits': 305615,
  'unique_visits': 80514,
  'refreshes': 197506,
  'unique_refreshes': 58746,
  'revisits': 47584,
  'unique_revisits': 28909,
  'longest_refresh': 124,
  'shortest_refresh': 1,
  'longest_revisit': 1431,
  'shortest_revisit': 1},
 'weather': {'visits': 439398,
  'unique_visits': 95615,
  'refreshes': 335018,
  'unique_refreshes': 67143,
  'revisits': 14548,
  'unique_revisits': 9319,
  'longest_refresh': 218,
  'shortest_refresh': 1,
  'longest_revisit': 1511,
  'shortest_revisit': 1},
 'msn-news': {'visits': 196614,
  'unique_visits': 90192,
  'refreshes': 95537,
  'unique_refreshes': 43656,
  'revisits': 25863,
  'unique_revisits': 17718,
  'longest_refresh': 289,
  'shortest_refresh': 1,
  'longest_revisit': 2325,
  'shortest_revisit': 1},
 'health': {'visits': 131760,
  'unique_visits': 50606,
  'refreshes': 74147,
  'unique_refreshes': 21950,
  'revisits': 13246,
  'unique_revisits': 7933,
  'longest_refresh': 96,
  'shortest_refresh': 1,
  'longest_revisit': 575,
  'shortest_revisit': 1},
 'living': {'visits': 96817,
  'unique_visits': 57597,
  'refreshes': 35703,
  'unique_refreshes': 17398,
  'revisits': 8680,
  'unique_revisits': 6149,
  'longest_refresh': 78,
  'shortest_refresh': 1,
  'longest_revisit': 776,
  'shortest_revisit': 1},
 'business': {'visits': 264899,
  'unique_visits': 112183,
  'refreshes': 141175,
  'unique_refreshes': 46468,
  'revisits': 22880,
  'unique_revisits': 14312,
  'longest_refresh': 893,
  'shortest_refresh': 1,
  'longest_revisit': 938,
  'shortest_revisit': 1},
 'msn-sports': {'visits': 216125,
  'unique_visits': 76948,
  'refreshes': 129522,
  'unique_refreshes': 47516,
  'revisits': 20169,
  'unique_revisits': 14823,
  'longest_refresh': 440,
  'shortest_refresh': 1,
  'longest_revisit': 2098,
  'shortest_revisit': 1},
 'sports': {'visits': 395880,
  'unique_visits': 119138,
  'refreshes': 263167,
  'unique_refreshes': 72063,
  'revisits': 24150,
  'unique_revisits': 15525,
  'longest_refresh': 310,
  'shortest_refresh': 1,
  'longest_revisit': 839,
  'shortest_revisit': 1},
 'summary': {'visits': 56576,
  'unique_visits': 29200,
  'refreshes': 22110,
  'unique_refreshes': 9890,
  'revisits': 13926,
  'unique_revisits': 7078,
  'longest_refresh': 824,
  'shortest_refresh': 1,
  'longest_revisit': 1822,
  'shortest_revisit': 1},
 'bbs': {'visits': 25249,
  'unique_visits': 2082,
  'refreshes': 22510,
  'unique_refreshes': 1631,
  'revisits': 1022,
  'unique_revisits': 419,
  'longest_refresh': 729,
  'shortest_refresh': 1,
  'longest_revisit': 1342,
  'shortest_revisit': 1},
 'travel': {'visits': 16972,
  'unique_visits': 11006,
  'refreshes': 5417,
  'unique_refreshes': 2925,
  'revisits': 1510,
  'unique_revisits': 1048,
  'longest_refresh': 19,
  'shortest_refresh': 1,
  'longest_revisit': 2299,
  'shortest_revisit': 1}}
In [ ]:
plot_categories_statistics(categories, dataset_stats)
In [ ]:
plot_statistics_categories(categories, dataset_stats)